Set up

set.seed(2021)

library(MouseGastrulationData)
library(ggplot2)
library(reshape)
library(org.Mm.eg.db)

Load functions

source("../scripts/StabMap_functions.R")

Load some data using MouseGastrulationData

Sample 29, corresponding to E8.5 timepoint. And cell type colours for plotting.

SCE <- EmbryoAtlasData(samples = 29)
SCE
## class: SingleCellExperiment 
## dim: 29452 7569 
## metadata(0):
## assays(1): counts
## rownames(29452): ENSMUSG00000051951 ENSMUSG00000089699 ...
##   ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ENSEMBL SYMBOL
## colnames(7569): cell_95727 cell_95728 ... cell_103294 cell_103295
## colData names(17): cell barcode ... colour sizeFactor
## reducedDimNames(2): pca.corrected umap
## altExpNames(0):
celltype_colours = MouseGastrulationData::EmbryoCelltypeColours
celltype_colours
##                       Epiblast               Primitive Streak 
##                      "#635547"                      "#DABE99" 
##                Caudal epiblast                            PGC 
##                      "#9E6762"                      "#FACB12" 
##      Anterior Primitive Streak                      Notochord 
##                      "#C19F70"                      "#0F4A9C" 
##                  Def. endoderm                            Gut 
##                      "#F397C0"                      "#EF5A9D" 
##               Nascent mesoderm                 Mixed mesoderm 
##                      "#C594BF"                      "#DFCDE4" 
##          Intermediate mesoderm                Caudal Mesoderm 
##                      "#139992"                      "#3F84AA" 
##              Paraxial mesoderm               Somitic mesoderm 
##                      "#8DB5CE"                      "#005579" 
##            Pharyngeal mesoderm                 Cardiomyocytes 
##                      "#C9EBFB"                      "#B51D8D" 
##                      Allantois                   ExE mesoderm 
##                      "#532C8A"                      "#8870AD" 
##                     Mesenchyme Haematoendothelial progenitors 
##                      "#CC7818"                      "#FBBE92" 
##                    Endothelium            Blood progenitors 1 
##                      "#FF891C"                      "#F9DECF" 
##            Blood progenitors 2                     Erythroid1 
##                      "#C9A997"                      "#C72228" 
##                     Erythroid2                     Erythroid3 
##                      "#F79083"                      "#EF4E22" 
##                            NMP           Rostral neurectoderm 
##                      "#8EC792"                      "#65A83E" 
##            Caudal neurectoderm                   Neural crest 
##                      "#354E23"                      "#C3C388" 
##   Forebrain/Midbrain/Hindbrain                    Spinal cord 
##                      "#647A4F"                      "#CDE088" 
##               Surface ectoderm              Visceral endoderm 
##                      "#F7F79E"                      "#F6BFCB" 
##                   ExE endoderm                   ExE ectoderm 
##                      "#7F6874"                      "#989898" 
##              Parietal endoderm 
##                      "#1A1A1A"

Generate some subsets of genes

random_500 = sample(rownames(SCE), 500)
random_1000 = sample(rownames(SCE), 1000)
random_2000 = sample(rownames(SCE), 2000)
random_5000 = sample(rownames(SCE), 5000)

Generate similarity for all genes

This will be re-used multiple times. Note this is a dense matrix, as output from igraph::similarity.

full_sim = generateSimilarity(SCE)
dim(full_sim)
## [1] 7569 7569
full_sim[1:5,1:5]
##            cell_95727 cell_95728 cell_95729 cell_95730 cell_95731
## cell_95727          1          0          0          0          0
## cell_95728          0          1          0          0          0
## cell_95729          0          0          1          0          0
## cell_95730          0          0          0          1          0
## cell_95731          0          0          0          0          1

Calculate uncertainty scores for random subsets of genes

If plot = TRUE then a UMAP embedding is generated among the subset features. I can add an additional UMAP scatterplot with more information using the plotAdditional argument, which is itself a list of the proto objects to add, here is colouring the cells based on their cell type, with additional ggplot style arguments passed in the second object within the list.

plotAdditional = list("celltype", 
                      list(scale_colour_manual(values = celltype_colours),
                        theme(legend.position = "bottom")))
scores_500 = getSubsetUncertainty(SCE,
                                  subsetGenes = random_500,
                                  full_sim = full_sim,
                                  plot = TRUE,
                                  plotAdditional = plotAdditional)

scores_1000 = getSubsetUncertainty(SCE,
                                  subsetGenes = random_1000,
                                  full_sim = full_sim,
                                  plot = TRUE,
                                  plotAdditional = plotAdditional)

scores_2000 = getSubsetUncertainty(SCE,
                                  subsetGenes = random_2000,
                                  full_sim = full_sim,
                                  plot = TRUE,
                                  plotAdditional = plotAdditional)

scores_5000 = getSubsetUncertainty(SCE,
                                  subsetGenes = random_5000,
                                  full_sim = full_sim,
                                  plot = TRUE,
                                  plotAdditional = plotAdditional)

Some exploration of these scores

scores_df = data.frame(cell = colnames(SCE),
                       score_500 = scores_500,
                       score_1000 = scores_1000,
                       score_2000 = scores_2000,
                       score_5000 = scores_5000,
                       celltype = SCE$celltype)

As the number of random genes increases, the uncertainty score tends to decrease.

ggplot(scores_df, aes(x = score_500, y = score_1000)) + 
  geom_point(aes(colour = celltype)) + 
  theme_classic() + 
  geom_abline(slope = 1, intercept = 0) +
  scale_colour_manual(values = celltype_colours) +
  NULL

ggplot(scores_df, aes(x = score_1000, y = score_2000)) + 
  geom_point(aes(colour = celltype)) + 
  theme_classic() + 
  geom_abline(slope = 1, intercept = 0) +
  scale_colour_manual(values = celltype_colours) +
  NULL

ggplot(scores_df, aes(x = score_2000, y = score_5000)) + 
  geom_point(aes(colour = celltype)) + 
  theme_classic() + 
  geom_abline(slope = 1, intercept = 0) +
  scale_colour_manual(values = celltype_colours) +
  NULL

The decrease in uncertainty score is varied between cell types, e.g. for Gut this is quite pronounced, while not so much for the Erythroid types.

scores_df_long = melt(scores_df, id.vars = c("cell", "celltype"))
ggplot(scores_df_long, aes(x = interaction(variable, celltype), y = value)) + 
  geom_boxplot(aes(fill = celltype)) +
  theme_classic() +
  scale_fill_manual(values = celltype_colours) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("") +
  theme(legend.position = "bottom") +
  NULL

Example using a dataset with non-overlapping features

I can also estimate the uncertainty for new data, given the subset. Here, I extract another sample from the MouseGastrulationData package, sample 36, corresponding to again a E8.5 timepoint. For this sample I restrict to genes that are related to cell cycle.

cycle.anno <- select(org.Mm.eg.db, keytype="GOALL", keys="GO:0007049", 
    columns="ENSEMBL")[,"ENSEMBL"]
cellCycleGenes = intersect(cycle.anno, rownames(SCE))
head(cellCycleGenes)
## [1] "ENSMUSG00000026842" "ENSMUSG00000029580" "ENSMUSG00000026836"
## [4] "ENSMUSG00000000532" "ENSMUSG00000052593" "ENSMUSG00000022893"
length(cellCycleGenes)
## [1] 1635
SCE_query <- EmbryoAtlasData(samples = 36)
SCE_query
## class: SingleCellExperiment 
## dim: 29452 4915 
## metadata(0):
## assays(1): counts
## rownames(29452): ENSMUSG00000051951 ENSMUSG00000089699 ...
##   ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ENSEMBL SYMBOL
## colnames(4915): cell_130406 cell_130407 ... cell_135319 cell_135320
## colData names(17): cell barcode ... colour sizeFactor
## reducedDimNames(2): pca.corrected umap
## altExpNames(0):

Calculate uncertainty scores for both datasets, using the cell cycle genes. This includes a call to cbind of the SingleCellExperiment objects, which is not trivial and should be used with care.

scores_CC = getSubsetUncertainty(SCE,
                                 querySCE = SCE_query,
                                 subsetGenes = cellCycleGenes,
                                 full_sim = full_sim,
                                 plot = TRUE,
                                 plotAdditional = plotAdditional)

length(scores_CC)
## [1] 12484
head(scores_CC)
## cell_95727 cell_95728 cell_95729 cell_95730 cell_95731 cell_95732 
##  0.1600000  0.6163265  0.1542857  0.5746939  0.6481633  0.6800000
scores_cc = data.frame(cell = names(scores_CC),
                       score = scores_CC,
                       celltype = cbind(SCE, SCE_query)[,names(scores_CC)]$celltype,
                       sample = cbind(SCE, SCE_query)[,names(scores_CC)]$sample)

ggplot(scores_cc, aes(x = interaction(sample, celltype), y = score)) + 
  geom_boxplot(aes(fill = celltype)) + 
  theme_classic() +
  scale_fill_manual(values = celltype_colours) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("") +
  theme(legend.position = "bottom") +
  NULL

sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-apple-darwin19.6.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /usr/local/Cellar/openblas/0.3.13/lib/libopenblasp-r0.3.13.dylib
## LAPACK: /usr/local/Cellar/r/4.0.4/lib/R/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] patchwork_1.1.1             BiocNeighbors_1.8.2        
##  [3] scuttle_1.0.4               scater_1.18.3              
##  [5] igraph_1.2.6                bluster_1.0.0              
##  [7] scran_1.18.3                org.Mm.eg.db_3.12.0        
##  [9] AnnotationDbi_1.52.0        reshape_0.8.8              
## [11] ggplot2_3.3.3               MouseGastrulationData_1.4.0
## [13] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
## [15] Biobase_2.50.0              GenomicRanges_1.42.0       
## [17] GenomeInfoDb_1.26.2         IRanges_2.24.1             
## [19] S4Vectors_0.28.1            BiocGenerics_0.36.0        
## [21] MatrixGenerics_1.2.0        matrixStats_0.57.0         
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6                  bit64_4.0.5                  
##  [3] RcppAnnoy_0.0.18              httr_1.4.2                   
##  [5] tools_4.0.4                   R6_2.5.0                     
##  [7] irlba_2.3.3                   vipor_0.4.5                  
##  [9] uwot_0.1.10                   DBI_1.1.1                    
## [11] colorspace_2.0-0              withr_2.4.1                  
## [13] gridExtra_2.3                 tidyselect_1.1.0             
## [15] bit_4.0.4                     curl_4.3                     
## [17] compiler_4.0.4                DelayedArray_0.16.1          
## [19] labeling_0.4.2                scales_1.1.1                 
## [21] rappdirs_0.3.3                stringr_1.4.0                
## [23] digest_0.6.27                 rmarkdown_2.6                
## [25] XVector_0.30.0                pkgconfig_2.0.3              
## [27] htmltools_0.5.1.1             sparseMatrixStats_1.2.0      
## [29] dbplyr_2.1.0                  fastmap_1.1.0                
## [31] limma_3.46.0                  rlang_0.4.10                 
## [33] RSQLite_2.2.3                 shiny_1.6.0                  
## [35] DelayedMatrixStats_1.12.2     farver_2.0.3                 
## [37] generics_0.1.0                BiocParallel_1.24.1          
## [39] dplyr_1.0.3                   RCurl_1.98-1.2               
## [41] magrittr_2.0.1                BiocSingular_1.6.0           
## [43] GenomeInfoDbData_1.2.4        Matrix_1.3-2                 
## [45] ggbeeswarm_0.6.0              Rcpp_1.0.6                   
## [47] munsell_0.5.0                 viridis_0.5.1                
## [49] lifecycle_0.2.0               stringi_1.5.3                
## [51] yaml_2.2.1                    edgeR_3.32.1                 
## [53] zlibbioc_1.36.0               plyr_1.8.6                   
## [55] BiocFileCache_1.14.0          AnnotationHub_2.22.0         
## [57] grid_4.0.4                    blob_1.2.1                   
## [59] promises_1.1.1                dqrng_0.2.1                  
## [61] ExperimentHub_1.16.0          crayon_1.3.4                 
## [63] lattice_0.20-41               cowplot_1.1.1                
## [65] beachmat_2.6.4                locfit_1.5-9.4               
## [67] knitr_1.30                    pillar_1.4.7                 
## [69] codetools_0.2-18              glue_1.4.2                   
## [71] BiocVersion_3.12.0            evaluate_0.14                
## [73] BiocManager_1.30.10           vctrs_0.3.6                  
## [75] httpuv_1.5.5                  gtable_0.3.0                 
## [77] purrr_0.3.4                   assertthat_0.2.1             
## [79] cachem_1.0.1                  xfun_0.20                    
## [81] rsvd_1.0.3                    mime_0.9                     
## [83] xtable_1.8-4                  RSpectra_0.16-0              
## [85] later_1.1.0.1                 viridisLite_0.3.0            
## [87] tibble_3.0.5                  beeswarm_0.2.3               
## [89] memoise_2.0.0                 statmod_1.4.35               
## [91] ellipsis_0.3.1                interactiveDisplayBase_1.28.0